Generating the CellDataSet object
# Rename bidirectionally promoted lncRNAs and antisense lncRNAs to
# 'lincRNAs'
fData(dat)$ccdsid <- gsub("antisense", "lincRNA", fData(dat)$ccdsid)
fData(dat)$ccdsid <- gsub("bidirectional_promoter_lncRNA", "lincRNA", fData(dat)$ccdsid)
# Calculate the mean copies per cell among all classes (Bulk) and draw the
# density plot for lincRNAs vs protein coding
dat.means <- detectGenes(dat, min_expr = 1)
dat.means <- dat.means[fData(dat.means)$num_cells_expressed >= 2]
fData(dat.means)$mean_cpc <- apply(exprs(dat.means), 1, mean)
fData(dat.means)$sd <- apply(exprs(dat.means), 1, sd)
fData(dat.means)$BCV <- (fData(dat.means)$sd/fData(dat.means)$mean_cpc)^2
tmp <- data.frame(gene_short_name = fData(dat.means)$gene_short_name, gene_type = fData(dat.means)$ccdsid,
mean_cpc = fData(dat.means)$mean_cpc, BCV = fData(dat.means)$BCV, num_cells_expresed = fData(dat.means)$num_cells_expressed)
dat_means <- subset(tmp, gene_type %in% c("protein_coding", "lincRNA"))
density.plot <- ggplot(dat_means) + geom_density(aes(x = log10(mean_cpc), color = gene_type)) +
scale_color_manual(values = c("red", "black")) + theme_bw()
density.plot
# Subset on lncRNAs and mRNAs
dat_lincRNA_sort <- subset(dat_means, gene_type %in% "lincRNA")
dat_mRNA_sort <- subset(dat_means, gene_type %in% "protein_coding")
# Number of lncRNAs
length(dat_lincRNA_sort$gene_short_name)
## [1] 339
# Number of mRNAs
length(dat_mRNA_sort$gene_short_name)
## [1] 14308
# Generate a boxplot on the batch data
dat_means$frac_cells_expressed <- dat_means$num_cells_expresed/length(rownames(pData(dat.means)))
boxplot <- ggplot(dat_means, aes(gene_type, frac_cells_expressed, fill = gene_type)) +
geom_boxplot(notch = T) + theme_bw() + scale_fill_manual(values = c("red",
"grey50"))
boxplot
# Generate empirical cumulative density function
ecdf <- ggplot(dat_means, aes(x = log10(BCV), color = gene_type)) + stat_ecdf(geom = "line") +
theme_bw() + scale_color_manual(values = c("red", "black"))
ecdf
# test dotplot
dot <- ggplot(dat_means, aes(x = log10(mean_cpc), y = frac_cells_expressed,
color = gene_type)) + geom_point() + facet_wrap(~gene_type) + theme_bw() +
scale_color_manual(values = c("red", "black"))
dot
dat.sort <- detectGenes(dat, min_expr = 1)
dat.sort <- dat.sort[fData(dat.sort)$num_cells_expressed >= 2]
fData(dat.sort)$mean_cpc_expressed_only <- esApply(dat.sort, 1, function(x) {
tmp <- x[x > 0]
mean(tmp, na.rm = T)
})
dat.sort <- subset(dat.sort, fData(dat.sort)$ccdsid %in% c("lincRNA", "protein_coding"))
expressed_only_density <- ggplot(fData(dat.sort), aes(x = log10(mean_cpc_expressed_only),
color = ccdsid)) + geom_density() + theme_bw() + scale_color_manual(values = c("red",
"black"))
expressed_only_density
# Density estimates
lnc.dens <- density(subset(dat_means, gene_type %in% "lincRNA")$mean_cpc)
log.lnc.dens <- density(log10(subset(dat_means, gene_type %in% "lincRNA")$mean_cpc))
PC.dens <- density(subset(dat_means, gene_type %in% "protein_coding")$mean_cpc)
log.PC.dens <- density(log10(subset(dat_means, gene_type %in% "protein_coding")$mean_cpc))
# Learn the distribution of lncRNAs
learned.lnc.den <- DiscreteDistribution(subset(dat_means, gene_type %in% "lincRNA")$mean_cpc)
# Create weighted probabilities on PC gene FPKM values from which to sample
PC.weights.on.lnc.D <- p(learned.lnc.den)(subset(dat_means, gene_type %in% "protein_coding")$mean_cpc,
lower.tail = FALSE)
PC.probs.on.lnc.D <- PC.weights.on.lnc.D/sum(PC.weights.on.lnc.D)
# Sample from PC genes to match lncRNA distribution
samp_PC <- sample(subset(dat_means, gene_type %in% "protein_coding")$mean_cpc,
replace = T, size = 339, prob = PC.probs.on.lnc.D)
# My Sampling function
mySample <- function(x, n, EmpDist) {
w <- p(EmpDist)(x$mean_cpc, lower.tail = FALSE)
probs <- w/sum(w)
samp <- x[sample(nrow(x), replace = FALSE, size = n, prob = probs), ]
return(samp)
}
# Check plot
plot(density(log10(mySample(subset(dat_means, gene_type %in% "protein_coding"),
n = 339, EmpDist = learned.lnc.den)$mean_cpc)), main = "Sampling mRNAs from the lncRNA expression distribution")
lines(density(log10(r(learned.lnc.den)(339))), col = "blue")
lines(log.lnc.dens, col = "red")
lines(log.PC.dens, col = "green")
legend(x = -0.5, y = 0.5, legend = c("lncRNA distribution", "Random draws from learned dist",
"Sampled mRNAs from learned dist"), col = c("red", "blue", "black"), lty = 1)
# Generate a subset of dat_means that is only mRNAs that fit the learned
# lncRNA expression distribution
mRNA.sample <- dat_means[dat_means$gene_short_name %in% sample(subset(dat_means,
gene_type %in% "protein_coding")$gene_short_name, replace = T, size = 339,
prob = PC.probs.on.lnc.D), ]
# change gene_type to sampled for sampled mRNAs and append it to dat_means
mRNA.sample$gene_type <- "sampled"
dat_sampled <- rbind(dat_means, mRNA.sample)
# Subset on TFs
TFs.sample <- subset(dat_means, dat_means$gene_short_name %in% TFs)
TFs.sample$gene_type <- "TFs"
# Add TFs to dat_sampled
dat_sampled <- rbind(dat_sampled, TFs.sample)
# Draw a new barplot
p1 <- ggplot(dat_sampled, aes(gene_type, frac_cells_expressed, fill = gene_type)) +
geom_boxplot(notch = T) + theme_bw() + scale_fill_manual(values = c("red",
"grey50", "darkblue", "#33CC00"))
p1
# Draw a new ecdf plot
p2 <- ggplot(dat_sampled, aes(x = log10(BCV), color = gene_type)) + stat_ecdf(geom = "line") +
theme_bw() + scale_color_manual(values = c("red", "black", "darkblue", "#33CC00"))
p2
# Draw a new density plot
p3 <- ggplot(dat_sampled, aes(x = log10(mean_cpc), color = gene_type)) + geom_density() +
theme_bw() + scale_color_manual(values = c("red", "black", "darkblue", "#33CC00"))
p3
# Draw a tSNE using sample mRNAs from above
subsetmRNA <- subset(dat, fData(dat)$gene_short_name %in% mRNA.sample$gene_short_name)
sampled.dat.tSNE <- Rtsne(t(round(vstExprs(subsetmRNA))), theta = 0.1, check_duplicates = F)
pData(subsetmRNA)$tSNE1_pos <- sampled.dat.tSNE$Y[, 1]
pData(subsetmRNA)$tSNE2_pos <- sampled.dat.tSNE$Y[, 2]
q <- ggplot(pData(subsetmRNA))
sampled_tSNE <- q + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos, color = group_num)) +
theme_bw() + coord_equal(1) + scale_color_brewer(palette = "Set1")
sampled_tSNE
# There doesn't seem to be any more or less organization present here. May
# be worth bootstrapping and quantfitying how well each of the tSNEs cluster
# the data points.
# tSNE using all genes
dat.tSNE <- Rtsne(t(round(vstExprs(dat))), theta = 0.1)
pData(dat)$tSNE1_pos <- dat.tSNE$Y[, 1]
pData(dat)$tSNE2_pos <- dat.tSNE$Y[, 2]
p <- ggplot(pData(dat))
tSNE_cluster <- p + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos, color = group_num)) +
theme_bw() + coord_equal(1) + scale_color_brewer(palette = "Set1")
tSNE_cluster
tSNE_level2class <- p + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos, color = level2class)) +
theme_bw() + coord_equal(1) + scale_color_manual(values = colorRampPalette(brewer.pal(9,
"Set1"))(length(unique(pData(tmp)$level2class))))
# tSNE using only protein coding genes
pcgenes <- subset(dat, fData(dat)$ccdsid %in% "protein_coding")
protein_coding.dat.tSNE <- Rtsne(t(round(vstExprs(pcgenes))), theta = 0.1, check_duplicates = F)
pData(pcgenes)$tSNE1_pos <- protein_coding.dat.tSNE$Y[, 1]
pData(pcgenes)$tSNE2_pos <- protein_coding.dat.tSNE$Y[, 2]
q <- ggplot(pData(pcgenes))
protein_coding_tSNE <- q + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos, color = group_num)) +
theme_bw() + coord_equal(1) + scale_color_brewer(palette = "Set1")
protein_coding_tSNE
# tSNE using only lncRNAs
lncgenes <- subset(dat, fData(dat)$ccdsid %in% "lincRNA")
lincRNA.dat.tSNE <- Rtsne(t(round(vstExprs(lncgenes))), theta = 0.1, check_duplicates = F)
pData(lncgenes)$tSNE1_pos <- lincRNA.dat.tSNE$Y[, 1]
pData(lncgenes)$tSNE2_pos <- lincRNA.dat.tSNE$Y[, 2]
q1 <- ggplot(pData(lncgenes))
# tSNE using only lncRNAs color by cluster number (as determined by
# Linnarsson Lab)
lincRNA_tSNE <- q1 + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos, color = group_num)) +
theme_bw() + coord_equal(1) + scale_color_brewer(palette = "Set1")
lincRNA_tSNE
# tSNE using only lncRNAs colored by level1class (as determined by
# Linnarsson Lab)
lincRNA_level1class_tSNE <- q1 + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos,
color = level1class)) + theme_bw() + coord_equal(1) + scale_color_brewer(palette = "Set1")
lincRNA_level1class_tSNE
# tSNE using only lncRNAs color by level2clss
lincRNA_tSNE_level2class <- q1 + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos,
color = level2class)) + theme_bw() + coord_equal(1) + scale_color_manual(values = colorRampPalette(brewer.pal(9,
"Set1"))(length(unique(pData(dat)$level2class))))
lincRNA_tSNE_level2class
# Color by Sex, size is reflective of XIST expression
lincRNA_tSNE_sex <- q1 + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos, color = sex,
size = exprs(lncgenes)[2, ])) + scale_size("Xist") + theme_bw() + coord_equal(1) +
scale_fill_brewer(palette = "Set1")
lincRNA_tSNE_sex
# Color by infered batch (utililze the run number from cell_id in order to
# infer batch)
pData(lncgenes)$cell_id <- colnames(exprs(lncgenes))
pData(lncgenes)$batch <- str_split_fixed(pData(lncgenes)$cell_id, "_", 2)[,
1]
j <- ggplot(pData(lncgenes))
lincRNA_tSNE_batch <- j + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos, color = batch)) +
theme_bw() + coord_equal(1) + scale_color_manual(values = colorRampPalette(brewer.pal(9,
"Set1"))(length(unique(pData(lncgenes)$batch))))
lincRNA_tSNE_batch
# tSNE by lincRNA without Xist and Tsix (we do this because Xist and Tsix
# expression splits the cells into two groups by sex)
sex_blind.dat <- subset(dat, fData(dat)$ccdsid %in% "lincRNA")
sex_blind_list <- as.vector(rownames(exprs(sex_blind.dat)[c(-2, -6), ]))
sex_blind.dat <- subset(sex_blind.dat, rownames(exprs(sex_blind.dat)) %in% sex_blind_list)
sex_blind_lincRNA.tSNE <- Rtsne(t(round(vstExprs(sex_blind.dat))), theta = 0.1,
check_duplicates = F)
pData(sex_blind.dat)$tSNE1_pos <- sex_blind_lincRNA.tSNE$Y[, 1]
pData(sex_blind.dat)$tSNE2_pos <- sex_blind_lincRNA.tSNE$Y[, 2]
n <- ggplot(pData(sex_blind.dat))
sex_blind_tSNE <- n + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos, color = group_num)) +
theme_bw() + coord_equal(1) + scale_color_brewer(palette = "Set1")
sex_blind_tSNE
Seperate the “dat” CellDataSet by “Cluster” and calculate mean expression of genes
# Seperate the 'dat' CellDataSet by 'Cluster' and calculate mean expression
# of genes
Cluster.split <- lapply(unique(pData(dat)$group_num), function(x) {
dat[, pData(dat)$group_num == x]
})
Cluster.split <- lapply(c(1:length(Cluster.split)), function(x) {
detectGenes(Cluster.split[[x]], min_expr = 1)
})
Cluster.split <- lapply(c(1:length(Cluster.split)), function(i) {
x <- Cluster.split[[i]]
x[fData(x)$num_cells_expressed >= 2]
})
Cluster.split <- lapply(Cluster.split, function(x) {
mean_cpc <- apply(exprs(x), 1, mean)
fData(x)$mean_cpc <- mean_cpc
return(x)
})
Cluster.split <- lapply(c(1:length(Cluster.split)), function(i) {
x <- Cluster.split[[i]]
frac_cells_expressed <- (fData(x)$num_cells_expressed/length(rownames(pData(x))))
fData(x)$frac_cells_expressed <- frac_cells_expressed
return(x)
})
tmp <- data.frame()
group_means <- lapply(c(1:length(Cluster.split)), function(i) {
x <- Cluster.split[[i]]
res <- data.frame(gene_short_name = fData(x)$gene_short_name, gene_type = fData(x)$ccdsid,
mean_cpc = fData(x)$mean_cpc, group_num = unique(pData(x)$group_num),
frac_cells_expressed = fData(x)$frac_cells_expressed)
tmp <- rbind(tmp, res)
})
tmp <- plyr::ldply(group_means, data.frame)
group_means <- subset(tmp, gene_type %in% c("protein_coding", "lincRNA"))
density.plot_Cluster <- ggplot(group_means) + geom_density(aes(x = log10(mean_cpc),
color = gene_type)) + facet_grid(. ~ group_num, labeller = labeller(group_num = function(x) {
paste("Cluster", x, sep = ":")
})) + scale_color_manual(values = c("red", "black")) + theme_bw()
density.plot_Cluster
# Generate box plots
boxplot_cluster <- ggplot(group_means, aes(gene_type, frac_cells_expressed,
fill = gene_type))
boxplot_cluster + geom_boxplot() + theme_bw() + scale_fill_manual(values = c("red",
"grey50")) + facet_grid(~group_num, labeller = labeller(group_num = function(x) {
paste("Cluster", x, sep = ":")
}))
# pointplot
group_means1 <- data.table(group_means, key = "gene_short_name")
group_means1 <- group_means1[, .SD[mean_cpc %in% max(mean_cpc)], by = gene_short_name]
group_means2 <- data.table(group_means, key = "gene_short_name")
group_means2 <- group_means2[, .SD[frac_cells_expressed %in% max(frac_cells_expressed) &
mean_cpc %in% max(mean_cpc)], by = gene_short_name]
adjusted_point <- ggplot(group_means2, aes(x = log10(mean_cpc), y = frac_cells_expressed,
color = gene_type)) + geom_point() + facet_wrap(~gene_type) + theme_bw() +
scale_color_manual(values = c("red", "black"))
adjusted_point
Seperate the “dat” CellDataSet by level2class and calculate mean expression of genes
# Seperate the 'dat' CellDataSet by the Interneuron level2classes and
# calculate mean_cpc within this new object
level2.split <- lapply(unique(pData(dat)[pData(dat)$level1class == "interneurons",
]$level2class), function(x) {
dat[, pData(dat)$level2class == x]
})
level2.split <- lapply(c(1:length(level2.split)), function(x) {
detectGenes(level2.split[[x]], min_expr = 1)
})
level2.split <- lapply(c(1:length(level2.split)), function(i) {
x <- level2.split[[i]]
x[fData(x)$num_cells_expressed >= 2]
})
level2.split <- lapply(level2.split, function(x) {
mean_cpc <- apply(exprs(x), 1, mean)
fData(x)$mean_cpc <- mean_cpc
return(x)
})
tmp <- data.frame()
group_means_level2class <- lapply(c(1:length(level2.split)), function(i) {
x <- level2.split[[i]]
res <- data.frame(gene_short_name = fData(x)$gene_short_name, gene_type = fData(x)$ccdsid,
mean_cpc = fData(x)$mean_cpc, level2class = unique(pData(x)$level2class))
tmp <- rbind(tmp, res)
})
tmp <- plyr::ldply(group_means_level2class, data.frame)
group_means_level2class <- subset(tmp, gene_type %in% c("protein_coding", "lincRNA"))
density.plot_level2class <- ggplot(group_means_level2class) + geom_density(aes(x = log10(mean_cpc),
color = gene_type)) + facet_wrap(~level2class, nrow = 2) + scale_color_manual(values = c("red",
"black")) + theme_bw()
density.plot_level2class
# Generate a barplot that has all of the subcell types for each of the 9
# clusters (again as determined by the Linnarsson group)
level_2_barplot <- lapply(c(1:9), function(x) {
pData(Cluster.split[[x]])$level2class %>% unique() %>% length()
})
level_2_barplot <- data.frame(level_2_barplot)
colnames(level_2_barplot) <- c("1", "2", "3", "4", "5", "6", "7", "8", "9")
rownames(level_2_barplot) <- "Number_of_Clusters"
level_2_barplot <- as.data.frame(level_2_barplot)
level_2_barplot <- as.data.frame(t(level_2_barplot))
bar.plot <- ggplot(level_2_barplot)
bar.plot + geom_bar(aes(x = rownames(level_2_barplot), y = level_2_barplot$Number_of_Clusters,
fill = rownames(level_2_barplot)), colour = "black", width = 0.5, stat = "identity") +
scale_fill_brewer(palette = "Set1", guide = guide_legend(title = "Cluster #")) +
xlab("Cluster #") + ylab("Number of cellular subtypes") + theme_bw() + theme(axis.text.x = element_blank())
# Completely split the CellDataSet by level2class and calculate the mean_cpc
# in each of the new CellDataSet objects
complete.level2.split <- lapply(unique(pData(dat)$level2class), function(x) {
dat[, pData(dat)$level2class == x]
})
complete.level2.split <- lapply(c(1:length(unique(pData(dat)$level2class))),
function(x) {
detectGenes(complete.level2.split[[x]], min_expr = 1)
})
complete.level2.split <- lapply(c(1:length(unique(pData(dat)$level2class))),
function(i) {
x <- complete.level2.split[[i]]
x[fData(x)$num_cells_expressed >= 2]
})
complete.level2.split <- lapply(complete.level2.split, function(x) {
mean_cpc <- apply(exprs(x), 1, mean)
fData(x)$mean_cpc <- mean_cpc
return(x)
})
complete.level2.split <- lapply(c(1:length(unique(pData(dat)$level2class))),
function(i) {
x = complete.level2.split[[i]]
fData(x)$gene_std_pc <- rowSds(exprs(x))
return(x)
})
complete.level2.split <- lapply(c(1:length(unique(pData(dat)$level2class))),
function(i) {
x = complete.level2.split[[i]]
fData(x)$BCV <- as.vector(fData(x)$gene_std_pc)/as.vector(fData(x)$mean_cpc)
return(x)
})
complete.level2.split <- lapply(c(1:length(unique(pData(dat)$level2class))),
function(i) {
x = complete.level2.split[[i]]
fData(x)$frac_cells_expressed <- fData(x)$num_cells_expressed/length(rownames(pData(x)))
return(x)
})
tmp <- data.frame()
bcv_level2class <- lapply(c(1:length(complete.level2.split)), function(i) {
x <- complete.level2.split[[i]]
res <- data.frame(gene_short_name = fData(x)$gene_short_name, gene_type = fData(x)$ccdsid,
mean_cpc = fData(x)$mean_cpc, BCV = fData(x)$BCV, gene_sd_pc = fData(x)$gene_std_pc,
level2class = unique(pData(x)$level2class), frac_cells_expressed = fData(x)$frac_cells_expressed)
tmp <- rbind(tmp, res)
})
names(bcv_level2class) <- unique(pData(dat)$level2class)
level2_bcv <- Reduce(function(...) merge(..., all = TRUE), bcv_level2class)
level2_bcv <- subset(level2_bcv, gene_type %in% c("lincRNA", "protein_coding"))
# Generate density plots
density.plot_all_level2class <- ggplot(level2_bcv) + geom_density(aes(x = log10(mean_cpc),
color = gene_type)) + facet_wrap(~level2class, nrow = 8) + scale_color_manual(values = c("red",
"black")) + theme_bw()
density.plot_all_level2class
# Generate box plots
boxplot_level2Class <- ggplot(level2_bcv, aes(gene_type, frac_cells_expressed,
fill = gene_type))
boxplot_level2Class + geom_boxplot() + theme_bw() + scale_fill_manual(values = c("red",
"grey50")) + facet_wrap(~level2class, nrow = 8)
# generate point plots
level2_bcv1 <- data.table(level2_bcv, key = "gene_short_name")
level2_bcv1 <- level2_bcv1[, .SD[frac_cells_expressed %in% max(frac_cells_expressed) &
mean_cpc %in% max(mean_cpc)], by = gene_short_name]
level2class_adjusted_point <- ggplot(level2_bcv1, aes(x = log10(mean_cpc), y = frac_cells_expressed,
color = gene_type)) + geom_point() + facet_wrap(~gene_type) + theme_bw() +
scale_color_manual(values = c("red", "black"))
level2class_adjusted_point
# sanity test density plot
q1 <- ggplot(level2_bcv1, aes(x = log10(mean_cpc), color = gene_type)) + geom_density() +
theme_bw() + scale_color_manual(values = c("red", "black"))
# Some odds and ends that I may want to come back and use later.
cluster.bcv.dat <- lapply(unique(pData(dat)$group_num), function(x) {
dat[, pData(dat)$group_num == x]
})
cluster.bcv.dat <- lapply(c(1:length(cluster.bcv.dat)), function(x) {
detectGenes(cluster.bcv.dat[[x]], min_expr = 1)
})
cluster.bcv.dat <- lapply(c(1:length(cluster.bcv.dat)), function(i) {
x <- cluster.bcv.dat[[i]]
x[fData(x)$num_cells_expressed >= 2]
})
cluster.bcv.dat <- lapply(cluster.bcv.dat, function(x) {
mean_cpc <- apply(exprs(x), 1, mean)
fData(x)$mean_cpc <- mean_cpc
return(x)
})
cluster.bcv.dat <- lapply(c(1:length(cluster.bcv.dat)), function(i) {
x = cluster.bcv.dat[[i]]
fData(x)$gene_std_pc <- rowSds(exprs(x))
return(x)
})
cluster.bcv.dat <- lapply(c(1:length(unique(cluster.bcv.dat))), function(i) {
x = cluster.bcv.dat[[i]]
fData(x)$BCV <- as.vector(fData(x)$gene_std_pc)/as.vector(fData(x)$mean_cpc)
return(x)
})
tmp <- data.frame()
bcv_cluster <- lapply(c(1:length(cluster.bcv.dat)), function(i) {
x <- cluster.bcv.dat[[i]]
res <- data.frame(gene_short_name = fData(x)$gene_short_name, gene_type = fData(x)$ccdsid,
mean_cpc = fData(x)$mean_cpc, BCV = fData(x)$BCV, gene_sd_pc = fData(x)$gene_std_pc,
cluster_num = rep(i))
tmp <- rbind(tmp, res)
})
bcv <- Reduce(function(...) merge(..., all = TRUE), bcv_cluster)
linc_bcv <- subset(bcv, bcv$gene_type == "lincRNA")
mRNA_bcv <- subset(bcv, bcv$gene_type == "protein_coding")
delta_median_by_cluster <- lapply(c(1:9), function(x) {
median(subset(mRNA_bcv, mRNA_bcv$cluster_num == x)$mean_cpc) - median(subset(linc_bcv,
linc_bcv$cluster_num == x)$mean_cpc)
})
names(delta_median_by_cluster) <- c("Cluster 1", "Cluster 2", "Cluster 3", "Cluster 4",
"Cluster 5", "Cluster 6", "Cluster 7", "Cluster 8", "Cluster 9")
# Generate a scatter plot of Δ median copies per cell (mRNA-lncRNA) for each
# cluster
delta_median_by_cluster <- as.matrix(delta_median_by_cluster)
delta_median_by_cluster <- as.data.frame(delta_median_by_cluster)
colnames(delta_median_by_cluster) <- "delta_median_mean_cpc"
delta_median_by_cluster$delta_median_mean_cpc <- as.numeric(delta_median_by_cluster$delta_median_mean_cpc)
delta_median_by_cluster$Number_of_subtypes <- as.vector(level_2_barplot$Number_of_Clusters)
p <- ggplot(delta_median_by_cluster, aes(x = Number_of_subtypes, y = delta_median_mean_cpc))
scatter.plot <- p + geom_point(aes(color = rownames(delta_median_by_cluster))) +
geom_smooth(aes(alpha = 0.1), method = "lm", se = T, color = "black", alpha = 0.2) +
scale_color_brewer(palette = "Set1", guide = guide_legend(title = "Cluster #")) +
guides(fill = "none") + theme_bw() + xlab("Number of cellular subtypes") +
ylab("Δ median copies per cell (mRNA-lncRNA)") + theme(legend.position = "none")
scatter.plot
# generate ecdf plots
subset_bcv <- subset(bcv, bcv$gene_type %in% c("lincRNA", "protein_coding"))
ecdf_cluster <- ggplot(subset_bcv, aes(x = log10(BCV), color = gene_type)) +
stat_ecdf(geom = "line") + facet_wrap(~cluster_num, nrow = 1) + scale_color_manual(values = c("red",
"black")) + theme_bw()
ecdf_cluster
sessionInfo()
## R version 3.3.2 (2016-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Sierra 10.12.3
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] splines stats4 parallel stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] bios2mds_1.2.2 rgl_0.98.1
## [3] cluster_2.0.5 scales_0.4.1
## [5] e1071_1.6-8 amap_0.8-14
## [7] data.table_1.10.0 distr_2.6
## [9] SweaveListingUtils_0.7.5 sfsmisc_1.1-0
## [11] startupmsg_0.9.3 RColorBrewer_1.1-2
## [13] GenomicFeatures_1.26.2 AnnotationDbi_1.36.0
## [15] GenomicRanges_1.26.2 GenomeInfoDb_1.10.2
## [17] IRanges_2.8.1 S4Vectors_0.12.1
## [19] tidyr_0.6.1 matrixStats_0.51.0
## [21] magrittr_1.5 dplyr_0.5.0
## [23] gapmap_0.0.4 reshape2_1.4.2
## [25] Rtsne_0.11 monocle_2.2.0
## [27] DDRTree_0.1.4 irlba_2.1.2
## [29] VGAM_1.0-2 ggplot2_2.2.1
## [31] Biobase_2.34.0 BiocGenerics_0.20.0
## [33] Matrix_1.2-7.1 stringr_1.1.0
##
## loaded via a namespace (and not attached):
## [1] jsonlite_1.2 shiny_0.14.2
## [3] assertthat_0.1 Rsamtools_1.26.1
## [5] yaml_2.1.14 slam_0.1-40
## [7] RSQLite_1.1-2 backports_1.0.4
## [9] lattice_0.20-34 limma_3.30.7
## [11] digest_0.6.11 XVector_0.14.0
## [13] colorspace_1.3-2 httpuv_1.3.3
## [15] fastICA_1.2-0 htmltools_0.3.5
## [17] plyr_1.8.4 XML_3.98-1.5
## [19] pheatmap_1.0.8 HSMMSingleCell_0.108.0
## [21] qlcMatrix_0.9.5 biomaRt_2.30.0
## [23] zlibbioc_1.20.0 xtable_1.8-2
## [25] BiocParallel_1.8.1 tibble_1.2
## [27] combinat_0.0-8 SummarizedExperiment_1.4.0
## [29] lazyeval_0.2.0 mime_0.5
## [31] memoise_1.0.0 evaluate_0.10
## [33] MASS_7.3-45 class_7.3-14
## [35] tools_3.3.2 formatR_1.4
## [37] munsell_0.4.3 Biostrings_2.42.1
## [39] grid_3.3.2 RCurl_1.95-4.8
## [41] htmlwidgets_0.8 igraph_1.0.1
## [43] labeling_0.3 bitops_1.0-6
## [45] rmarkdown_1.3 gtable_0.2.0
## [47] DBI_0.5-1 R6_2.2.0
## [49] GenomicAlignments_1.10.0 knitr_1.15.1
## [51] rtracklayer_1.34.1 rprojroot_1.1
## [53] stringi_1.1.2 Rcpp_0.12.8